import glob
import os
import random

from tqdm import tqdm

from gxl_ai_utils.utils import utils_file
def convert():
    """Merge per-dataset train cut manifests into one shuffled jsonl + gzip.

    For each dataset directory under ``input_dir``, locate its
    ``*_cuts_train.jsonl`` manifest, concatenate all lines, shuffle them,
    then write the merged manifest to ``output_path`` and a gzip-compressed
    copy next to it.

    Raises:
        FileNotFoundError: if a dataset directory contains no
            ``*_cuts_train.jsonl`` manifest.
    """
    input_dir = "/home/work_nfs8/xlgeng/new_workspace/gxl_ai_utils/eggs/cats_and_dogs/icefall_datahandle/manifest2cuts/data/cuts"
    datanames_list = ['3000h', 'wenetspeech_0', 'wenetspeech_1']
    res_path_list = []  # manifest paths actually used (kept for debugging)
    res_list = []
    for dataname in datanames_list:
        input_dir_dataname = os.path.join(input_dir, dataname)
        # Fail loudly with a useful message instead of a bare IndexError
        # when the expected manifest is missing from a dataset directory.
        matches = glob.glob(os.path.join(input_dir_dataname, '*_cuts_train.jsonl'))
        if not matches:
            raise FileNotFoundError(
                f"no *_cuts_train.jsonl manifest found under {input_dir_dataname}")
        cuts_path_i = matches[0]
        res_path_list.append(cuts_path_i)
        item_list = utils_file.load_list_file_clean(cuts_path_i)
        res_list.extend(item_list)
    # Shuffle once globally so training batches mix all three datasets.
    random.shuffle(res_list)
    output_path = "/home/work_nfs8/xlgeng/new_workspace/gxl_ai_utils/eggs/cats_and_dogs/icefall_datahandle/manifest2cuts/data/cuts/all_13000_2/all_13000_cuts_train.jsonl"
    # Make sure the target directory exists before writing the merged file.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    gz_path = output_path + '.gz'
    utils_file.write_list_to_file(res_list, output_path)
    utils_file.do_compress_file_by_gzip(output_path, gz_path)



# Script entry point: run the merge when executed directly.
if __name__ == '__main__':
    convert()

